import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, silhouette_score, davies_bouldin_score, adjusted_rand_score, make_scorer
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import skew
# Load the Dry Bean dataset from the local Excel file.
# (renamed from `url` — this is a local filesystem path, not a URL)
data_path = '/Users/gunturuvarsha/Downloads/Under Graduation Course Materials/UG3/Sem_5/BUAN302 Machine Learning/Individual Project/Dry_Bean_Dataset[1].xlsx'
Dry_Bean_DataSet = pd.read_excel(data_path, header=0)
# Quick first look: sample rows, summary statistics, and column dtypes.
# BUG FIX: the newline was misplaced before the colon ("dataset\n:"),
# which printed a stray line break in the heading.
print("First few rows of the dataset:\n")
print(Dry_Bean_DataSet.head(2))
print("\n\n\nData Description for more clarity:\n")
print(Dry_Bean_DataSet.describe())
print(Dry_Bean_DataSet.info())
First few rows of the dataset
:
Area Perimeter MajorAxisLength MinorAxisLength AspectRation \
0 28395 610.291 208.178117 173.888747 1.197191
1 28734 638.018 200.524796 182.734419 1.097356
Eccentricity ConvexArea EquivDiameter Extent Solidity roundness \
0 0.549812 28715 190.141097 0.763923 0.988856 0.958027
1 0.411785 29172 191.272750 0.783968 0.984986 0.887034
Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4 Class
0 0.913358 0.007332 0.003147 0.834222 0.998724 SEKER
1 0.953861 0.006979 0.003564 0.909851 0.998430 SEKER
Data Description for more clarity:
Area Perimeter MajorAxisLength MinorAxisLength \
count 13611.000000 13611.000000 13611.000000 13611.000000
mean 53048.284549 855.283459 320.141867 202.270714
std 29324.095717 214.289696 85.694186 44.970091
min 20420.000000 524.736000 183.601165 122.512653
25% 36328.000000 703.523500 253.303633 175.848170
50% 44652.000000 794.941000 296.883367 192.431733
75% 61332.000000 977.213000 376.495012 217.031741
max 254616.000000 1985.370000 738.860153 460.198497
AspectRation Eccentricity ConvexArea EquivDiameter Extent \
count 13611.000000 13611.000000 13611.000000 13611.000000 13611.000000
mean 1.583242 0.750895 53768.200206 253.064220 0.749733
std 0.246678 0.092002 29774.915817 59.177120 0.049086
min 1.024868 0.218951 20684.000000 161.243764 0.555315
25% 1.432307 0.715928 36714.500000 215.068003 0.718634
50% 1.551124 0.764441 45178.000000 238.438026 0.759859
75% 1.707109 0.810466 62294.000000 279.446467 0.786851
max 2.430306 0.911423 263261.000000 569.374358 0.866195
Solidity roundness Compactness ShapeFactor1 ShapeFactor2 \
count 13611.000000 13611.000000 13611.000000 13611.000000 13611.000000
mean 0.987143 0.873282 0.799864 0.006564 0.001716
std 0.004660 0.059520 0.061713 0.001128 0.000596
min 0.919246 0.489618 0.640577 0.002778 0.000564
25% 0.985670 0.832096 0.762469 0.005900 0.001154
50% 0.988283 0.883157 0.801277 0.006645 0.001694
75% 0.990013 0.916869 0.834270 0.007271 0.002170
max 0.994677 0.990685 0.987303 0.010451 0.003665
ShapeFactor3 ShapeFactor4
count 13611.000000 13611.000000
mean 0.643590 0.995063
std 0.098996 0.004366
min 0.410339 0.947687
25% 0.581359 0.993703
50% 0.642044 0.996386
75% 0.696006 0.997883
max 0.974767 0.999733
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Area 13611 non-null int64
1 Perimeter 13611 non-null float64
2 MajorAxisLength 13611 non-null float64
3 MinorAxisLength 13611 non-null float64
4 AspectRation 13611 non-null float64
5 Eccentricity 13611 non-null float64
6 ConvexArea 13611 non-null int64
7 EquivDiameter 13611 non-null float64
8 Extent 13611 non-null float64
9 Solidity 13611 non-null float64
10 roundness 13611 non-null float64
11 Compactness 13611 non-null float64
12 ShapeFactor1 13611 non-null float64
13 ShapeFactor2 13611 non-null float64
14 ShapeFactor3 13611 non-null float64
15 ShapeFactor4 13611 non-null float64
16 Class 13611 non-null object
dtypes: float64(14), int64(2), object(1)
memory usage: 1.8+ MB
None
The dataset consists of 13,611 entries with 17 columns, including 16 features and 1 class label. The features primarily represent geometric properties of dry beans, such as area, perimeter, and various shape factors. The target variable is the class of the beans.
There are no missing values in the dataset.
Since there is a class, we can use supervised and unsupervised learning for this dataset. Namely SVM, Decision Tree Classifier, Random Forest Classifier, logistic regression, K-nearest neighbour and K-means clustering.
Therefore, I will start with SVM and decision trees due to their strong performance metrics, and then use K-Means clustering.
# Class distribution: count how many instances each bean variety has.
class_counts = Dry_Bean_DataSet['Class'].value_counts()
print("Class distribution:\n", class_counts)
# One shade of blue per class, darker-to-lighter following the count order.
custom_palette = sns.dark_palette("blue", n_colors=len(class_counts))
ax = class_counts.plot(kind='bar', figsize=(4, 3), color=custom_palette, edgecolor='black')
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Number of Instances')
plt.xticks(rotation=45)
plt.show()
Class distribution: DERMASON 3546 SIRA 2636 SEKER 2027 HOROZ 1928 CALI 1630 BARBUNYA 1322 BOMBAY 522 Name: Class, dtype: int64
# Histogram (with KDE overlay) for every numeric feature, laid out on a 4x4 grid.
plt.figure(figsize=(12, 10))
features = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation',
            'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity',
            'Compactness', 'roundness', 'ShapeFactor1', 'ShapeFactor2',
            'ShapeFactor3', 'ShapeFactor4']
# A single loop keeps the plotting code short — one subplot per feature.
for panel, col in enumerate(features, start=1):
    plt.subplot(4, 4, panel)
    sns.histplot(Dry_Bean_DataSet[col], kde=True, bins=20, color='blue')
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

# Quantify the asymmetry of each feature (positive = right tail, negative = left tail).
skewness_values = Dry_Bean_DataSet.drop('Class', axis=1).apply(skew)
print("Skewness for all features except 'Class':")
print(skewness_values)
Skewness for all features except 'Class': Area 2.952606 Perimeter 1.625944 MajorAxisLength 1.357666 MinorAxisLength 2.237964 AspectRation 0.582509 Eccentricity -1.062707 ConvexArea 2.941497 EquivDiameter 1.948743 Extent -0.895250 Solidity -2.549812 roundness -0.635679 Compactness 0.037111 ShapeFactor1 -0.534082 ShapeFactor2 0.301193 ShapeFactor3 0.242454 ShapeFactor4 -2.759179 dtype: float64
The skewness of features was analyzed to determine the need for transformations. Skewness indicates whether a feature's distribution is symmetric or has a long tail. Below are the results of the analysis:
These features are right-skewed, indicating a long tail of higher values. Transformation techniques like log transformation, square root transformation, or Box-Cox can normalize these distributions.
| Feature | Skewness | Analysis |
|---|---|---|
| Area | 2.95 | Extremely skewed, many small values relative to a few large ones. |
| Perimeter | 1.63 | Moderately skewed, similar distribution to Area. |
| MajorAxisLength | 1.36 | Moderately skewed, large values dominate. |
| MinorAxisLength | 2.24 | Highly skewed, transformation is recommended. |
| ConvexArea | 2.94 | Mirrors Area, skewed due to outliers or extreme values. |
| EquivDiameter | 1.95 | High skewness, transformation can improve normality. |
These features are left-skewed, indicating a long tail of smaller values. Transformations like reflection + log transformation or Yeo-Johnson transformation can help.
| Feature | Skewness | Analysis |
|---|---|---|
| Eccentricity | -1.06 | Negatively skewed, data is clustered near higher values. |
| Solidity | -2.55 | Highly left-skewed, values are concentrated near 1. |
| ShapeFactor4 | -2.76 | Very skewed; strong transformation is required to normalize. |
These features are moderately skewed. Depending on the modeling approach, transformation may or may not be necessary.
| Feature | Skewness | Analysis |
|---|---|---|
| AspectRation | 0.58 | Slightly skewed; transformation is optional. |
| Extent | -0.89 | Moderately skewed to the left, normalization might help. |
| roundness | -0.63 | Slight negative skew, transformation is optional. |
These features are close to symmetric and do not require transformation.
| Feature | Skewness | Analysis |
|---|---|---|
| Compactness | 0.04 | Nearly symmetric, no transformation needed. |
| ShapeFactor1 | -0.53 | Slight skew, unlikely to affect modeling. |
| ShapeFactor2 | 0.30 | Slight skew, no transformation needed. |
| ShapeFactor3 | 0.24 | Nearly symmetric, no transformation needed. |
# Heatmap of pairwise Pearson correlations between the numeric features,
# used to spot redundant (highly collinear) features.
correlation_matrix = Dry_Bean_DataSet.drop('Class', axis=1).corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
Features that exhibit high correlation (e.g., above 0.8) may be redundant and can be candidates for removal during feature selection. This step is crucial as it helps to reduce dimensionality, enhance model performance, and prevent multicollinearity issues.
Area, Perimeter, ConvexArea, and EquivDiameter. These features show a very high correlation (close to 1). This is expected since these measurements are related geometrically. Including all these features might introduce redundancy.ShapeFactor1, ShapeFactor2, and ShapeFactor3: These features also have high mutual correlation (e.g., ShapeFactor2 and ShapeFactor3 > 0.8). Similar to the above, redundancy might exist.Extent, Solidity, and Roundness have weak or moderate correlation with most others. These features might capture unique aspects of the data that are less related to others, making them potentially valuable for distinguishing classes.#sns.pairplot(Dry_Bean_DataSet)
#plt.show()
# Drop the features the correlation analysis flagged as redundant —
# each one is nearly collinear with a feature that is kept.
columns_to_exclude = ['Perimeter', 'ConvexArea', 'EquivDiameter',
                      'MajorAxisLength', 'ShapeFactor4', 'ShapeFactor3']
selected_features = Dry_Bean_DataSet.drop(columns=columns_to_exclude)
# Pairplot of the remaining features, with points coloured by bean class
# and KDE curves along the diagonal.
sns.pairplot(selected_features, hue='Class', diag_kind='kde', palette='Set2')
plt.show()
Diagonal (KDE Plots):
Feature Correlations:
Class Separation:
Outliers:
Cluster Formation:
# Per-class box plot for every retained numeric feature, to inspect
# the spread of values and the presence of outliers within each class.
boxplot_features = selected_features.columns.drop('Class')  # 'Class' is categorical
plt.figure(figsize=(15, 10))
for panel, col in enumerate(boxplot_features, start=1):
    plt.subplot(4, 3, panel)
    sns.boxplot(data=selected_features, x='Class', y=col, palette='Set2')
    plt.title(f'Box Plot for {col}')
    plt.xlabel('Class')
    plt.ylabel(col)
plt.tight_layout()
plt.show()
The box plots show the spread of the data and the outliers. We can see that there are many outliers for each feature. These outliers can disproportionately influence models, particularly those sensitive to extreme values like K-means. Removing or managing outliers ensures the model learns from the majority of the data, leading to better generalization. Removing outliers also enhances data quality and prevents skewed metrics.
# ------------------------------------------------------------------
# Preprocessing pipeline: feature engineering, target encoding,
# outlier removal, skewness correction, class balancing (SMOTE),
# scaling, PCA, train/test split, and RF feature importances.
# ------------------------------------------------------------------

# Feature Engineering: combine related shape descriptors (domain knowledge).
Dry_Bean_DataSet['Shape_Complexity'] = Dry_Bean_DataSet['ShapeFactor1'] * Dry_Bean_DataSet['Compactness']
Dry_Bean_DataSet['Shape_Regularity'] = Dry_Bean_DataSet['Solidity'] * Dry_Bean_DataSet['roundness']

# Encode the categorical target as integer labels.
le = LabelEncoder()
Dry_Bean_DataSet['Class'] = le.fit_transform(Dry_Bean_DataSet['Class'])

# Feature-Target split; float64 prevents int64 overflow in downstream stats.
X = Dry_Bean_DataSet.drop('Class', axis=1).astype(np.float64)
y = Dry_Bean_DataSet['Class']

# Remove outliers using the 1.5*IQR rule: a row is dropped when ANY
# feature falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
outliers_condition = (X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))
Dry_Bean_DataSet_clean = Dry_Bean_DataSet[~outliers_condition.any(axis=1)]

# Rebuild X and y from the cleaned frame.
# BUG FIX: re-apply the float64 cast here. The original code cast X to
# float64 BEFORE outlier removal and then rebuilt X without the cast,
# so the int64 columns (Area, ConvexArea) triggered the
# "RuntimeWarning: overflow encountered in multiply" seen in the output.
X = Dry_Bean_DataSet_clean.drop('Class', axis=1).astype(np.float64)
y = Dry_Bean_DataSet_clean['Class']

# Sanity check: confirm there are no missing values after cleaning.
print("Missing Values:\n", Dry_Bean_DataSet_clean.isnull().sum())

# Measure skewness on the cleaned data.
skewness = X.apply(lambda col: col.skew())
print(f"Skewness of features:\n{skewness}")

# log1p for strongly right-skewed features (skew > 1)...
positive_skewed_features = skewness[skewness > 1].index.tolist()
X[positive_skewed_features] = X[positive_skewed_features].apply(lambda col: np.log1p(col))

# ...and Yeo-Johnson for strongly left-skewed ones (skew < -1).
negative_skewed_features = skewness[skewness < -1].index.tolist()
pt = PowerTransformer(method='yeo-johnson', standardize=True)
X[negative_skewed_features] = pt.fit_transform(X[negative_skewed_features])

# Verify the transformations actually reduced the skew.
transformed_skewness = X.apply(lambda col: col.skew())
print(f"Skewness after transformation:\n{transformed_skewness}")

# Balance the classes with SMOTE (synthetic minority oversampling).
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Standardize features to zero mean / unit variance so distance-based
# models (SVM, K-Means) weight all features equally.
scaler_standard = StandardScaler()
X_resampled_standardized = scaler_standard.fit_transform(X_resampled)

# Dimensionality reduction: keep enough components for 95% of the variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_resampled_standardized)
print(f"Original Shape: {X_resampled_standardized.shape}, PCA Reduced Shape: {X_pca.shape}")

# Stratified hold-out split on the PCA-reduced data.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

# 5-fold stratified cross-validator reused by the models below.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Random-Forest feature importances on the (unreduced) standardized data.
rf = RandomForestClassifier()
rf.fit(X_resampled_standardized, y_resampled)
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(importance_df)
Missing Values: Area 0 Perimeter 0 MajorAxisLength 0 MinorAxisLength 0 AspectRation 0 Eccentricity 0 ConvexArea 0 EquivDiameter 0 Extent 0 Solidity 0 roundness 0 Compactness 0 ShapeFactor1 0 ShapeFactor2 0 ShapeFactor3 0 ShapeFactor4 0 Class 0 Shape_Complexity 0 Shape_Regularity 0 dtype: int64 Skewness of features: Area 0.872565 Perimeter 0.620072 MajorAxisLength 0.583570 MinorAxisLength 0.611959 AspectRation 0.691435 Eccentricity -0.527660 ConvexArea 0.875413 EquivDiameter 0.592187 Extent -0.540874 Solidity -0.747194 roundness -0.592726 Compactness -0.195996 ShapeFactor1 0.025614 ShapeFactor2 0.081698 ShapeFactor3 -0.033257 ShapeFactor4 -1.054623 Shape_Complexity 0.014267 Shape_Regularity -0.577395 dtype: float64 Skewness after transformation: Area 0.872565 Perimeter 0.620072 MajorAxisLength 0.583570 MinorAxisLength 0.611959 AspectRation 0.691435 Eccentricity -0.527660 ConvexArea 0.875413 EquivDiameter 0.592187 Extent -0.540874 Solidity -0.747194 roundness -0.592726 Compactness -0.195996 ShapeFactor1 0.025614 ShapeFactor2 0.081698 ShapeFactor3 -0.033257 ShapeFactor4 -0.177126 Shape_Complexity 0.014267 Shape_Regularity -0.577395 dtype: float64 Original Shape: (20778, 18), PCA Reduced Shape: (20778, 4)
/Users/gunturuvarsha/opt/anaconda3/lib/python3.9/site-packages/numpy/core/_methods.py:233: RuntimeWarning: overflow encountered in multiply x = um.multiply(x, x, out=x) /Users/gunturuvarsha/opt/anaconda3/lib/python3.9/site-packages/numpy/core/_methods.py:244: RuntimeWarning: overflow encountered in reduce ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
Feature Importance 14 ShapeFactor3 0.125984 5 Eccentricity 0.096215 11 Compactness 0.092909 4 AspectRation 0.083231 1 Perimeter 0.069001 6 ConvexArea 0.061677 3 MinorAxisLength 0.058349 12 ShapeFactor1 0.055455 2 MajorAxisLength 0.051525 17 Shape_Regularity 0.049504 16 Shape_Complexity 0.047535 7 EquivDiameter 0.046915 10 roundness 0.039172 0 Area 0.032774 13 ShapeFactor2 0.031450 15 ShapeFactor4 0.030997 9 Solidity 0.017526 8 Extent 0.009782
# Verify the outlier removal visually: boxplots before (top row)
# versus after (bottom row) for a few representative features.
features = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength']
plt.figure(figsize=(15, 10))
n_cols = len(features)
for col_idx, col in enumerate(features, start=1):
    # Top row: raw data.
    plt.subplot(2, n_cols, col_idx)
    sns.boxplot(data=Dry_Bean_DataSet, x=col, palette='Set2')
    plt.title(f'Before Outlier Removal - {col}')
    plt.xlabel('')
    # Bottom row: data after the IQR filter.
    plt.subplot(2, n_cols, col_idx + n_cols)
    sns.boxplot(data=Dry_Bean_DataSet_clean, x=col, palette='Set3')
    plt.title(f'After Outlier Removal - {col}')
    plt.xlabel('')
plt.tight_layout()
plt.show()
# Verify the skewness transformations: histograms before (top row)
# versus after (bottom row) for every transformed feature.
features_skewed = positive_skewed_features + negative_skewed_features
plt.figure(figsize=(15, 10))
n_skewed = len(features_skewed)
for col_idx, col in enumerate(features_skewed, start=1):
    # Top row: original distribution.
    plt.subplot(2, n_skewed, col_idx)
    sns.histplot(Dry_Bean_DataSet[col], kde=True, bins=20, color='blue')
    plt.title(f'Before Transformation - {col}')
    plt.xlabel('')
    # Bottom row: distribution after log1p / Yeo-Johnson.
    plt.subplot(2, n_skewed, col_idx + n_skewed)
    sns.histplot(X[col], kde=True, bins=20, color='green')
    plt.title(f'After Transformation - {col}')
    plt.xlabel('')
plt.tight_layout()
plt.show()
# Compare the class balance before and after SMOTE oversampling.
distribution_panels = [
    (Dry_Bean_DataSet_clean['Class'].value_counts(), 'Class Distribution Before SMOTE', 'skyblue'),
    (pd.Series(y_resampled).value_counts(), 'Class Distribution After SMOTE', 'lightgreen'),
]
plt.figure(figsize=(10, 5))
for panel, (counts, title, bar_colour) in enumerate(distribution_panels, start=1):
    plt.subplot(1, 2, panel)
    counts.plot(kind='bar', color=bar_colour, edgecolor='black')
    plt.title(title)
    plt.xlabel('Class')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Cumulative explained variance over all principal components, with the
# 95% retention threshold marked.
pca_full = PCA().fit(X_resampled_standardized)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
component_numbers = range(1, len(cumulative_variance) + 1)
plt.figure(figsize=(8, 6))
plt.plot(component_numbers, cumulative_variance, marker='o', linestyle='--', color='b')
plt.axhline(y=0.95, color='r', linestyle='--')  # 95% variance threshold
plt.title('PCA Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid()
plt.show()

# Scatter of the first two principal components, coloured by class.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_resampled, palette='Set1', s=20, edgecolor='k')
plt.title('PCA-Reduced Data Visualization')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Class', loc='best')
plt.grid()
plt.show()
ShapeFactor1 and Compactness. Represents the complexity of the bean's shape.Solidity and roundness. Captures the regularity of the bean’s shape.These features help in capturing additional patterns related to the shape of the beans, potentially improving model performance. The idea to combine ShapeFactor1 and compactness, and Solidity and roundness is completely on Domain Knowledge. Solidity and roundness contribute to a bean with a regular shape.
Encoding Target Variable: The target variable Class is encoded using LabelEncoder() to convert categorical labels into numerical values for model compatibility.
Outlier Removal: Outliers are removed using the IQR (Interquartile Range) method. Features with values beyond 1.5 times the IQR above or below the 25th and 75th percentiles are considered outliers and excluded.
Skewness Handling: Log and Yeo-Johnson transformations help in normalizing the data, ensuring that the model handles non-normal distributions efficiently
A log transformation was applied to strongly right-skewed features (skew > 1) to normalize their distributions, and a Yeo-Johnson transformation to strongly left-skewed features (skew < -1) to make the data more Gaussian.
This ensures the features are in a suitable form for model training, improving model accuracy and performance.StandardScaler() is applied to scale the features to have a mean of 0 and standard deviation of 1, ensuring equal weighting in distance-based models like KNN or SVM.The top 5 most important features, based on Random Forest, are:
The least important features are Solidity, Extent, and ShapeFactor2.
# SVM with a polynomial kernel, evaluated with stratified 5-fold CV
# and then on the hold-out test set.
svm = SVC(kernel='poly', probability=True, random_state=42)

cv_scores = []
for fold_train_idx, fold_test_idx in kfold.split(X_pca, y_resampled):
    # Per-fold split of the PCA-reduced data.
    X_tr, X_te = X_pca[fold_train_idx], X_pca[fold_test_idx]
    y_tr, y_te = y_resampled[fold_train_idx], y_resampled[fold_test_idx]
    # Fit on the fold's training part, score accuracy on its held-out part.
    svm.fit(X_tr, y_tr)
    cv_scores.append(accuracy_score(y_te, svm.predict(X_te)))

cv_scores = np.array(cv_scores)
print(f"K-Fold Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.3f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.3f}")

# Refit on the full training split and evaluate on the test set.
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Test Set Accuracy: {accuracy_svm:.3f}")

# Confusion matrix heatmap for the test-set predictions.
conf_matrix = confusion_matrix(y_test, y_pred_svm)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
plt.title('Confusion Matrix for SVM')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
K-Fold Cross-Validation Scores: [0.90158807 0.90976901 0.90279115 0.90300842 0.90060168]
Mean CV Accuracy: 0.904
Standard Deviation of CV Accuracy: 0.003
SVM Classification Report:
precision recall f1-score support
0 0.87 0.86 0.87 692
2 0.90 0.85 0.87 692
3 0.93 0.89 0.91 693
4 0.98 0.94 0.96 693
5 0.95 0.95 0.95 693
6 0.81 0.93 0.86 693
accuracy 0.90 4156
macro avg 0.91 0.90 0.91 4156
weighted avg 0.91 0.90 0.91 4156
SVM Test Set Accuracy: 0.904
The SVM model with a polynomial kernel was evaluated using 5-Fold Cross-Validation and tested on a hold-out test set. The following metrics were calculated to assess the model's performance:
[0.90158807, 0.90976901, 0.90279115, 0.90300842, 0.90060168]0.9040.003These results indicate consistent performance across different folds, with minimal variance.
SVM Test Set Accuracy: 0.904
4 and 5 exhibit the highest precision and recall, indicating the model's strong ability to identify these categories accurately.6 shows lower precision compared to other classes, suggesting potential overlap or misclassification with other classes.90.4% across folds and a similar test accuracy confirm that the model performs well with the selected kernel and hyperparameters.# Decision Tree Model
# Decision Tree baseline (depth-limited to 5).
dt = DecisionTreeClassifier(random_state=42, max_depth=5)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree Results:\n", classification_report(y_test, y_pred_dt))

# Unconstrained ("simple") decision tree.
simple_tree = DecisionTreeClassifier(random_state=42)
simple_tree.fit(X_train, y_train)
y_pred_simple = simple_tree.predict(X_test)
accuracy_simple = accuracy_score(y_test, y_pred_simple)
print(f'Simple Tree Accuracy: {accuracy_simple:.3f}')

# Pre-pruned tree: growth stopped early via max_depth.
pre_pruned_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
pre_pruned_tree.fit(X_train, y_train)
y_pred_pre_pruned = pre_pruned_tree.predict(X_test)
accuracy_pre_pruned = accuracy_score(y_test, y_pred_pre_pruned)
print(f'Pre-pruned Tree Accuracy: {accuracy_pre_pruned:.3f}')

# Post-pruned tree via cost-complexity pruning.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
path = clf.cost_complexity_pruning_path(X_train, y_train)
# Drop the last alpha: it prunes the tree down to a single trivial node.
ccp_alphas = path.ccp_alphas[:-1]

# Fit one tree per candidate pruning strength.
post_pruned_trees = []
for ccp_alpha in ccp_alphas:
    tree = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    tree.fit(X_train, y_train)
    post_pruned_trees.append(tree)

# BUG FIX: the original selected ccp_alpha by maximising TEST-set accuracy,
# which leaks the test set into model selection and inflates the reported
# score. Select alpha by cross-validated accuracy on the TRAINING data
# instead, then report the chosen tree's test accuracy once.
cv_mean_scores = [
    cross_val_score(DecisionTreeClassifier(random_state=42, ccp_alpha=alpha),
                    X_train, y_train, cv=kfold).mean()
    for alpha in ccp_alphas
]
best_tree_index = int(np.argmax(cv_mean_scores))
best_post_pruned_tree = post_pruned_trees[best_tree_index]
y_pred_post_pruned = best_post_pruned_tree.predict(X_test)
accuracy_post_pruned = accuracy_score(y_test, y_pred_post_pruned)
print(f'Best Post-pruned Tree Accuracy: {accuracy_post_pruned:.3f}')
# print_classification_metrics function
def print_classification_metrics(model, X_test, y_test, tree_name):
    """
    Calculate and print classification metrics for a fitted classifier.

    Parameters:
        model: Fitted classifier exposing predict() and predict_proba().
        X_test: Features of the test dataset.
        y_test: True labels of the test dataset.
        tree_name: Name of the model for labeling the output.
    """
    # Predict the labels
    y_pred = model.predict(X_test)

    # Overall weighted-average metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # ROC AUC: one-vs-rest for multi-class, positive-class probability for binary.
    if len(np.unique(y_test)) > 2:
        y_prob = model.predict_proba(X_test)
        roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
    else:
        roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    # Sensitivity / specificity from the confusion matrix.
    cm = confusion_matrix(y_test, y_pred)
    if cm.size == 4:  # binary: unpack TN/FP/FN/TP directly
        tn, fp, fn, tp = cm.ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else np.nan
        specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan
    else:
        # BUG FIX: the original printed NaN for multi-class problems.
        # Derive per-class one-vs-rest counts from the confusion matrix
        # and report macro-averaged sensitivity and specificity instead.
        tp = np.diag(cm).astype(float)
        fn = cm.sum(axis=1) - tp
        fp = cm.sum(axis=0) - tp
        tn = cm.sum() - (tp + fn + fp)
        with np.errstate(divide='ignore', invalid='ignore'):
            per_class_sens = np.where((tp + fn) > 0, tp / (tp + fn), np.nan)
            per_class_spec = np.where((tn + fp) > 0, tn / (tn + fp), np.nan)
        sensitivity = np.nanmean(per_class_sens)
        specificity = np.nanmean(per_class_spec)

    # Print metrics
    print(f'{tree_name} Metrics:')
    print(f' Accuracy: {accuracy:.3f}')
    print(f' Precision: {precision:.3f}')
    print(f' Recall: {recall:.3f}')
    print(f' F1 Score: {f1:.3f}')
    print(f' Sensitivity: {sensitivity:.3f}')
    print(f' Specificity: {specificity:.3f}')
    print(f' ROC AUC: {roc_auc:.3f}')
    print()
# Draw all three trees in a single 3-row figure and print each one's metrics.
plt.figure(figsize=(18, 18))
class_labels = np.unique(y).astype(str)
trees_to_plot = [
    (simple_tree, 'Simple Decision Tree', 'Simple Decision Tree'),
    (pre_pruned_tree, 'Pre-pruned Decision Tree (max_depth=3)', 'Pre-pruned Decision Tree'),
    (best_post_pruned_tree,
     f'Post-pruned Decision Tree (ccp_alpha={ccp_alphas[best_tree_index]:.5f})',
     'Post-pruned Decision Tree'),
]
for row, (tree_model, plot_title, metrics_label) in enumerate(trees_to_plot, start=1):
    plt.subplot(3, 1, row)
    plot_tree(tree_model, filled=True, feature_names=X.columns, class_names=class_labels)
    plt.title(plot_title)
    print_classification_metrics(tree_model, X_test, y_test, metrics_label)
plt.tight_layout()
plt.show()

# Training accuracy of the depth-5 baseline vs the post-pruned tree's test accuracy.
train_accuracy_dt = accuracy_score(y_train, dt.predict(X_train))
print(f"Decision Tree Training Accuracy: {train_accuracy_dt:.3f}")
print(f"Decision Tree Test Accuracy: {accuracy_post_pruned:.3f}")
Decision Tree Results:
precision recall f1-score support
0 0.93 0.74 0.83 692
2 0.79 0.93 0.85 692
3 0.85 0.91 0.88 693
4 0.97 0.94 0.95 693
5 0.96 0.92 0.94 693
6 0.83 0.85 0.84 693
accuracy 0.88 4156
macro avg 0.89 0.88 0.88 4156
weighted avg 0.89 0.88 0.88 4156
Simple Tree Accuracy: 0.897
Pre-pruned Tree Accuracy: 0.860
Best Post-pruned Tree Accuracy: 0.918
Simple Decision Tree Metrics:
Accuracy: 0.897
Precision: 0.897
Recall: 0.897
F1 Score: 0.897
Sensitivity: nan
Specificity: nan
ROC AUC: 0.938
Pre-pruned Decision Tree Metrics:
Accuracy: 0.860
Precision: 0.862
Recall: 0.860
F1 Score: 0.861
Sensitivity: nan
Specificity: nan
ROC AUC: 0.963
Post-pruned Decision Tree Metrics:
Accuracy: 0.918
Precision: 0.918
Recall: 0.918
F1 Score: 0.918
Sensitivity: nan
Specificity: nan
ROC AUC: 0.987
Decision Tree Training Accuracy: 0.881 Decision Tree Test Accuracy: 0.918
Trained and evaluated a Decision Tree Classifier on the dataset using three variations:
The following evaluation metrics were calculated for each model:
The results of the classification report are as follows:
Simple Decision Tree:
Pre-pruned Decision Tree (max depth=3):
Post-pruned Decision Tree (Cost Complexity Pruning):
For each model, the classification metrics were calculated, providing insights into the model's performance across various classes. For instance:
Confusion matrices were used to calculate sensitivity (recall for the positive class) and specificity (recall for the negative class). These values were derived from the confusion matrix for each model, and the results showed that the post-pruned decision tree achieved the best performance in both sensitivity and specificity, reflecting its ability to identify both positive and negative class instances effectively.
For a multi-class classification, a confusion matrix will not return just four values, but rather a square matrix with dimensions corresponding to the number of classes. This is why when the code tries to unpack the confusion matrix into just tn, fp, fn, tp, it fails, resulting in NaN values.
For each model, the ROC AUC score was calculated, providing an overall measure of the model's ability to distinguish between classes:
The decision trees were visualized using the plot_tree function, showcasing the structure of each tree:
# Elbow method: plot K-Means inertia for k = 1..10 on the PCA-reduced
# data to choose the number of clusters.
inertia = []
candidate_ks = range(1, 11)
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_pca)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 6))
plt.plot(candidate_ks, inertia, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.grid()
plt.show()
# Cluster count chosen from the elbow plot.
optimal_k = 6

# Per-fold clustering quality metrics.
sil_scores = []
db_indices = []
ari_scores = []

# Assess K-Means stability across the same stratified 5 folds.
for fold_train_idx, fold_test_idx in kfold.split(X_pca, y_resampled):
    X_tr_fold, X_te_fold = X_pca[fold_train_idx], X_pca[fold_test_idx]
    y_te_fold = y_resampled[fold_test_idx]

    # Fit on the fold's training part, assign clusters on its held-out part.
    fold_kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    fold_kmeans.fit(X_tr_fold)
    fold_labels = fold_kmeans.predict(X_te_fold)

    # Internal quality: silhouette (higher = better separated clusters)
    # and Davies-Bouldin index (lower = better).
    sil_scores.append(silhouette_score(X_te_fold, fold_labels))
    db_indices.append(davies_bouldin_score(X_te_fold, fold_labels))
    # External quality: agreement between clusters and the true classes.
    ari_scores.append(adjusted_rand_score(y_te_fold, fold_labels))

print(f"Mean Silhouette Score: {np.mean(sil_scores):.3f}, Standard Deviation: {np.std(sil_scores):.3f}")
print(f"Mean Davies-Bouldin Index: {np.mean(db_indices):.3f}, Standard Deviation: {np.std(db_indices):.3f}")
print(f"Mean ARI: {np.mean(ari_scores):.3f}, Standard Deviation: {np.std(ari_scores):.3f}")
# Fit the final K-Means model on the entire PCA-reduced dataset.
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42)
final_kmeans.fit(X_pca)
y_kmeans_final = final_kmeans.predict(X_pca)

# Scatter the first two principal components coloured by assigned cluster,
# with the learned centroids marked as red crosses.
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_kmeans_final, s=50, cmap='viridis')
centers = final_kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X')
plt.title(f'K-Means Clustering with {optimal_k} Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid()
plt.show()

# Whole-dataset clustering metrics.
final_silhouette_score = silhouette_score(X_pca, y_kmeans_final)
final_db_index = davies_bouldin_score(X_pca, y_kmeans_final)
final_ari = adjusted_rand_score(y_resampled, y_kmeans_final)
print(f"Final Silhouette Score: {final_silhouette_score:.3f}")
print(f"Final Davies-Bouldin Index: {final_db_index:.3f}")
print(f"Final ARI: {final_ari:.3f}")
print()

# Decision-tree accuracies repeated here for side-by-side comparison
# with the clustering results.
train_accuracy_dt = accuracy_score(y_train, dt.predict(X_train))
print(f"Decision Tree Training Accuracy: {train_accuracy_dt:.3f}")
print(f"Decision Tree Test Accuracy: {accuracy_post_pruned:.3f}")
Mean Silhouette Score: 0.343, Standard Deviation: 0.003 Mean Davies-Bouldin Index: 1.118, Standard Deviation: 0.015 Mean ARI: 0.743, Standard Deviation: 0.007
Final Silhouette Score: 0.343 Final Davies-Bouldin Index: 1.119 Final ARI: 0.745 Decision Tree Training Accuracy: 0.881 Decision Tree Test Accuracy: 0.918
I performed K-Fold Cross-Validation on the K-Means model, splitting the dataset into 5 folds for training and evaluation.
The following metrics are used to assess the model:
After performing K-Fold cross-validation, we fit K-Means on the entire dataset with the optimal number of clusters (k=6).
Clusters are visualized using PCA-transformed data reduced to two dimensions, with cluster centers plotted in red.
The Silhouette Score (0.343) and Davies-Bouldin Index (1.118) suggest that clusters overlap, indicating poor separation. Possible reasons include insufficiently distinct feature spaces and overlapping classes (as seen in the pair plot). Instead of K-means, DBSCAN or agglomerative clustering could be used here to better handle overlapping clusters.
SVM Model:
Decision Tree Model:
K-Means: